

# Load the data set from a space-separated text file whose first line holds
# the column names, then show the first rows as a quick sanity check.
babies <- read.csv(
  file = "Planilha de dados Babies.txt",
  header = TRUE,
  sep = " "
)

head(babies)

# Data cleaning ----
# Keep only the rows in which none of the seven variables carries its sentinel
# value: 999 for bwt, gestation and weight; 99 for age and height; 9 for
# parity and smoke.
# NOTE(review): these look like the data set's "unknown" codes -- confirm
# against the data dictionary.
#
# A single combined mask replaces seven successive subsetting passes. which()
# is kept on purpose: it discards rows whose comparison evaluates to NA,
# whereas plain logical indexing would turn them into all-NA rows.
babies <- babies[which(
  babies$bwt != 999 &
    babies$gestation != 999 &
    babies$parity != 9 &
    babies$age != 99 &
    babies$height != 99 &
    babies$weight != 999 &
    babies$smoke != 9
), ]

## Important variables ----
# Univariate screening: bwt is regressed on each candidate predictor alone.
# For every predictor the script fits a simple linear model, prints its ANOVA
# table, draws the scatter plot and overlays the fitted line.
#
# Formulas use bare column names with `data = babies` instead of `babies$...`
# so that the fits are tied to the data frame (predict() with `newdata` then
# works) and printed terms / axis labels read "gestation", not
# "babies$gestation". The fitted values are unchanged.

# gestation
lm1 <- lm(bwt ~ gestation, data = babies)
anova(lm1)
plot(bwt ~ gestation, data = babies)
abline(lm1)

# parity
lm2 <- lm(bwt ~ parity, data = babies)
anova(lm2)
plot(bwt ~ parity, data = babies)
abline(lm2)

# age
lm3 <- lm(bwt ~ age, data = babies)
anova(lm3)
plot(bwt ~ age, data = babies)
abline(lm3)

# height
lm4 <- lm(bwt ~ height, data = babies)
anova(lm4)
plot(bwt ~ height, data = babies)
abline(lm4)

# weight
lm5 <- lm(bwt ~ weight, data = babies)
anova(lm5)
plot(bwt ~ weight, data = babies)
abline(lm5)

# smoke
lm6 <- lm(bwt ~ smoke, data = babies)
anova(lm6)
plot(bwt ~ smoke, data = babies)
abline(lm6)

# Predictor variables:
# gestation
# height
# weight
# smoke
# Biologically relevant interactions:
# gestation:height
# gestation:weight
# gestation:smoke
# height:weight
# weight:smoke

# Additive model: the four predictor variables, no interaction terms.
lm7 <- lm(
  formula = bwt ~ gestation + height + weight + smoke,
  data = babies
)
summary(object = lm7)

# Full model: the four main effects plus the five candidate interactions.
lmfull <- lm(
  formula = bwt ~ gestation + height + weight + smoke +
    gestation:height + gestation:weight + gestation:smoke +
    height:weight + weight:smoke,
  data = babies
)
summary(object = lmfull)
anova(lmfull)

# Simplifying the model:
## Interactions are removed one at a time, and each reduced model is compared
## with lmfull or with the model retained so far. The interactions were
## removed in order from the largest p-value (least significant) to the
## smallest p-value (most significant), according to summary(lmfull).

# lmbabies1 = lmfull - weight:smoke
lmbabies1 <- lm(
  formula = bwt ~ gestation + height + weight + smoke +
    gestation:height + gestation:weight + gestation:smoke +
    height:weight,
  data = babies
)
summary(object = lmbabies1)
# F-test of the reduced model against the full model.
anova(lmfull, lmbabies1)
# Recorded outcome: p-value > 0.05 -> keep the simpler model

# lmbabies2 = lmbabies1 - height:weight
lmbabies2 <- lm(
  formula = bwt ~ gestation + height + weight + smoke +
    gestation:height + gestation:weight + gestation:smoke,
  data = babies
)
summary(object = lmbabies2)
# F-test of the reduced model against the previously retained model.
anova(lmbabies1, lmbabies2)
# Recorded outcome: p-value > 0.05 -> keep the simpler model

# lmbabies3 = lmbabies2 - gestation:height
lmbabies3 <- lm(
  formula = bwt ~ gestation + height + weight + smoke +
    gestation:weight + gestation:smoke,
  data = babies
)
summary(object = lmbabies3)
# F-test of the reduced model against the previously retained model.
anova(lmbabies2, lmbabies3)
# Recorded outcome: p-value > 0.05 -> keep the simpler model

# lmbabies4 = lmbabies3 - gestation:weight
lmbabies4 <- lm(
  formula = bwt ~ gestation + height + weight + smoke +
    gestation:smoke,
  data = babies
)
summary(object = lmbabies4)
# F-test of the reduced model against the previously retained model.
anova(lmbabies3, lmbabies4)
# Recorded outcome: p-value > 0.05 -> keep the simpler model

# lmbabies5 = lmbabies4 - gestation:smoke (no interaction terms left)
lmbabies5 <- lm(
  formula = bwt ~ gestation + height + weight + smoke,
  data = babies
)
summary(object = lmbabies5)
# F-test of the reduced model against the previously retained model.
anova(lmbabies4, lmbabies5)
# Recorded outcome: p-value < 0.05 -> keep the more complex model.

## Starting from the retained model, lmbabies4, the predictor variables are
## removed one at a time, using the same criterion as for the interactions.

# lmbabies6 = lmbabies4 - weight
lmbabies6 <- lm(
  formula = bwt ~ gestation + height + smoke + gestation:smoke,
  data = babies
)
summary(object = lmbabies6)
# F-test against the retained model lmbabies4.
anova(lmbabies4, lmbabies6)
# Recorded outcome: p-value < 0.05 -> retain the more complex model

# lmbabies7 = lmbabies4 - smoke
# NOTE(review): the smoke main effect is dropped while gestation:smoke stays
# in the formula -- confirm that this is the intended comparison.
lmbabies7 <- lm(
  formula = bwt ~ gestation + height + weight + gestation:smoke,
  data = babies
)
summary(object = lmbabies7)
# F-test against the retained model lmbabies4.
anova(lmbabies7, lmbabies4)
# Recorded outcome: p-value < 0.05 -> retain the more complex model

# lmbabies8 = lmbabies4 - height
lmbabies8 <- lm(bwt ~ gestation + weight + smoke + gestation:smoke,
                data = babies)
# Summarise the model just fitted (the script previously printed lmbabies4
# here by mistake).
summary(lmbabies8)
# F-test against the retained model lmbabies4.
anova(lmbabies8, lmbabies4)
# Recorded outcome: p-value < 0.05 -> retain the more complex model

# lmbabies9 = lmbabies4 - gestation
# NOTE(review): the gestation main effect is dropped while gestation:smoke
# stays in the formula -- confirm that this is the intended comparison.
lmbabies9 <- lm(
  formula = bwt ~ weight + smoke + height + gestation:smoke,
  data = babies
)
summary(object = lmbabies9)
# F-test against the retained model lmbabies4.
anova(lmbabies9, lmbabies4)
# Recorded outcome: p-value < 0.05 -> retain the more complex model

### Conclusion: the selected model was lmbabies4:
### lmbabies4 <- lm(bwt ~ gestation + height + weight + smoke +
###                 gestation:smoke, data = babies)

# Model diagnostics ----
# Draw the diagnostic plots produced by plot() for the selected model on a
# 2 x 2 grid. par() returns the previous settings, which are restored
# afterwards so that later plots are not forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(lmbabies4)
par(old_par)
